1. Import Data and Library
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
library(Seurat)
## Attaching SeuratObject
library(SeuratData)
## Registered S3 method overwritten by 'cli':
## method from
## print.boxx spatstat.geom
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
## ── Installed datasets ───────────────────────────────────── SeuratData v0.2.1 ──
## ✓ bmcite 0.3.0 ✓ pbmcMultiome 0.1.0
## ────────────────────────────────────── Key ─────────────────────────────────────
## ✓ Dataset loaded successfully
## > Dataset built with a newer version of Seurat than installed
## ❓ Unknown version of Seurat installed
library(cowplot)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
load("PCA_seurat.Rdata")
# # Just an example way to preprocess data, to demonstrate the PCA part
# # You could use SCTransform into RunPCA also
# bm <- LoadData(ds = "bmcite")
# bm <- Seurat::NormalizeData(bm) %>% Seurat::FindVariableFeatures() %>% Seurat::ScaleData() %>% Seurat::RunPCA()
# celltype_factor <- as.numeric(as.factor(bm@meta.data$celltype.l2))
# scale_mat <- t(bm[["RNA"]]@scale.data[bm[["RNA"]]@var.features,])
# dim(scale_mat); round(scale_mat[1:5,1:5],2)
# idx1 <- which(colnames(scale_mat) == "IGHA1")
# idx2 <- which(colnames(scale_mat) == "GYPE")
# plot(scale_mat[,idx1], scale_mat[,idx2], asp = T, col = celltype_factor,
# xlab = colnames(scale_mat)[idx1],
# ylab = colnames(scale_mat)[idx2])
# This is what plotting the scale.data matrix looks like
# It still looks like it's very aligned with the axes
# pca_obj <- bm[["pca"]]
# dim(pca_obj@cell.embeddings)
# dim(pca_obj@feature.loadings)
# pca_mat <- pca_obj@cell.embeddings %*% t(pca_obj@feature.loadings)
# dim(pca_mat); round(pca_mat[1:5,1:5],2)
# idx1 <- which(colnames(pca_mat) == "IGHA1")
# idx2 <- which(colnames(pca_mat) == "GYPE")
# plot(pca_mat[,idx1], pca_mat[,idx2], asp = T, col = cell_type,
# xlab = colnames(pca_mat)[idx1],
# ylab = colnames(pca_mat)[idx2])
# By plotting the low-rank matrix, some of the structure is more visible
# You can verify that pca_mat is indeed low-rank by running
# Matrix::rankMatrix(pca_mat) (it should return 50), but this line takes a few minutes to run
# rand_ind <- c()
# for (cell in cell_labels){
# set.seed(10)
#
# subcell_ind <- which(cell_type == cell)
# subcell_len <- length(subcell_ind)
# subcell_mat <- pca_mat[subcell_ind, ]
#
# row_ind <- apply(subcell_mat, 1, function(x){length(which(x != 0))})
# idx <- order(row_ind, decreasing = T)
#
# rand_ind <- c(rand_ind, idx[1:(subcell_len/30)])
# }
#
# rand_ind <- c()
# for (cell in cell_labels){
# set.seed(10)
# subcell_ind <- which(cell_type == cell)
# sub_rand <- sample(length(subcell_ind),
# length(subcell_ind)/20)
# rand_ind <- c(rand_ind, subcell_ind[sub_rand])
# }
#
# sub_dat <- pca_mat[rand_ind, ]
#
# col_ind <- apply(sub_dat, 2, function(x){length(which(x != 0))})
# idx <- order(col_ind, decreasing = T)[1:500]
#
# sub_dat <- sub_dat[, idx]
#
# dat_hclust <- hclust(dist(t(sub_dat)))
# dat_index <- dat_hclust$order
#
# sub_dat <- sub_dat[, dat_index]
# sub_celltype <- cell_type[rand_ind]
sub_cluster_labels <- as.numeric(as.factor(sub_celltype))
# save(bm, pca_mat, cell_type, sub_dat, sub_celltype, cormat_list, file = "PCA_seurat.Rdata")
2-1. Dependency Measures
library(reshape2) # melt function
library(ggplot2) # ggplot function
library(pcaPP) # Fast Kendall function
library(energy) # Distance Correlation
library(Hmisc) # Hoeffding's D measure
library(zebu) # Normalized Mutual Information
# library(minerva) # Maximum Information Coefficient
library(XICOR) # Chatterjee's Coefficient
# library(dHSIC) # Hilbert Schmidt Independence Criterion
library(VineCopula) # Blomqvist's Beta
make_cormat <- function(dat_mat){
matrix_dat <- matrix(nrow = ncol(dat_mat), ncol = ncol(dat_mat))
cor_mat_list <- list()
basic_cor <- c("pearson", "spearman")
# find each of the correlation matrices with Pearson, Spearman, Kendall Correlation Coefficients
for (i in 1:2){
print(i)
cor_mat <- stats::cor(dat_mat, method = basic_cor[i])
cor_mat[upper.tri(cor_mat, diag = T)] <- NA
cor_mat_list[[i]] <- cor_mat
}
# functions that take matrix or data.frame as input
no_loop_function <- c(pcaPP::cor.fk, Hmisc::hoeffd,
VineCopula::BetaMatrix)
for (i in 3:5){
print(i)
fun <- no_loop_function[[i-2]]
cor_mat <- fun(dat_mat)
if (i == 4){ # Hoeffding's D
cor_mat <- cor_mat$D
}
cor_mat[upper.tri(cor_mat, diag = T)] <- NA
cor_mat_list[[i]] <- cor_mat
}
# functions that take two variables as input to calculate correlations.
need_loop <- c(zebu::lassie, energy::dcor2d, XICOR::calculateXI)
for (i in 6:8){
print(i)
fun <- need_loop[[i-5]]
cor_mat <- matrix(nrow = ncol(dat_mat),
ncol = ncol(dat_mat))
for (j in 2:ncol(dat_mat)){
for (k in 1:(j-1)){
if (i == 6){
cor_mat[j, k] <- fun(cbind(dat_mat[, j], dat_mat[, k]), continuous=c(1,2), breaks = 6, measure = "npmi")$global
} else {
cor_mat[j, k] <- fun(as.numeric(dat_mat[, j]),
as.numeric(dat_mat[, k]))
}
}
}
cor_mat[upper.tri(cor_mat, diag = T)] <- NA
cor_mat_list[[i]] <- cor_mat
}
return(cor_mat_list)
}
draw_heatmap <- function(cor_mat){
len <- 5
melted_cormat <- melt(cor_mat)
melted_cormat <- melted_cormat[!is.na(melted_cormat$value),]
break_vec <- round(as.numeric(quantile(melted_cormat$value,
probs = seq(0, 1, length.out = len),
na.rm = T)),
4)
break_vec[1] <- break_vec[1]-1
break_vec[len] <- break_vec[len]+1
melted_cormat$value <- cut(melted_cormat$value, breaks = break_vec)
heatmap_color <- unique(melted_cormat$value)
heatmap <- ggplot(data = melted_cormat, aes(x = Var2, y = Var1, fill = value))+
geom_tile(colour = "Black") +
ggplot2::scale_fill_manual(breaks = sort(heatmap_color),
values = rev(scales::viridis_pal(begin = 0, end = 1)
(length(heatmap_color)))) +
theme_bw() + # make the background white
theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.ticks = element_blank(),
# erase tick marks and labels
axis.text.x = element_blank(), axis.text.y = element_blank())
return (heatmap)
}
make_cor_heatmap <- function(dat_mat, cor_mat_list){
fun_lable <- c("Pearson's Correlation", "Spearman's Correlation", "Kendall's Correlation",
"Hoeffding's D", "Blomqvist's Beta","Distance Correlation",
"NMI", "XI Correlation")
cor_heatmap_list <- list()
cor_abs_heatmap_list <- list()
# make correlation matrices
#cor_mat_list <- make_cormat(dat_mat)
for (i in 1:8){
print(i)
cor_mat <- abs(cor_mat_list[[i]])
# get heatmaps
cor_heatmap <- draw_heatmap(cor_mat)
# ggplot labels
ggplot_labs <- labs(title = paste("Heatmap of", fun_lable[i]),
x = "",
y = "",
fill = "Coefficient") # change the title and legend label
cor_heatmap_list[[i]] <- cor_heatmap + ggplot_labs
if (i %in% c(1,2,3,4,6)){
cor_abs_mat <- abs(cor_mat_list[[i]])
cor_abs_heatmap <- draw_heatmap(cor_abs_mat)
ggplot_abs_labs <- labs(title = paste("Abs Heatmap of", fun_lable[i]),
x = "", # change the title and legend label
y = "",
fill = "Coefficient")
cor_abs_heatmap_list[[i]] <- cor_abs_heatmap + ggplot_abs_labs
} else {
ggplot_abs_labs <- labs(title = paste("Abs Heatmap of", fun_lable[i]),
subtitle = "Equivalent to Non-Abs Heatmap",
x = "", # change the title and legend label
y = "",
fill = "Coefficient")
cor_abs_heatmap_list[[i]] <- cor_heatmap + ggplot_abs_labs
}
}
ans <- list(cor_heatmap_list, cor_abs_heatmap_list)
return (ans)
}
load("s_seurat_corr.RData")
# cormat_list <- make_cormat(sub_dat)
heatmap_list <- make_cor_heatmap(sub_dat, cormat_list)
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
cor_pearson_mat <- cormat_list[[1]]; cor_spearman_mat <- cormat_list[[2]];
cor_kendall_mat <- cormat_list[[3]]; cor_hoeffd_mat <- cormat_list[[4]];
cor_blomqvist_mat <- cormat_list[[5]]; cor_dist_mat <- cormat_list[[6]];
cor_MI_mat <- cormat_list[[7]]; cor_XI_mat <- cormat_list[[8]];
1. Pearson’s correlation coefficient
- Pearson’s correlation is to measure linear dependency of data, X and Y
- \(-1 \leq \rho_{Pearson}(X, Y) \leq 1\)
- \(\rho_{Pearson}(X, Y) = \frac{\sum(x_i-\bar{x})(y_i -\bar{y})}{\sum(x_i-\bar{x})^2(y_i -\bar{y})^2}\)
cor_pearson_mat[1:5,1:5]
## FAM13A SLC2A1 SMIM1 SASS6 FH
## FAM13A NA NA NA NA NA
## SLC2A1 0.07933118 NA NA NA NA
## SMIM1 0.07762977 0.153925319 NA NA NA
## SASS6 0.01104678 0.160323299 0.1670105 NA NA
## FH 0.02052069 0.002889402 0.1038307 0.07918712 NA
quantile(cor_pearson_mat, na.rm = T)
## 0% 25% 50% 75% 100%
## -0.459861079 -0.005412456 0.021230432 0.061425784 0.849321895
quantile(abs(cor_pearson_mat), na.rm = T)
## 0% 25% 50% 75% 100%
## 7.684486e-07 9.049377e-03 2.756643e-02 6.259634e-02 8.493219e-01
# plot the smallest correlations
cor_pearson_vec <- sort(abs(cor_pearson_mat), decreasing = T)
plot(cor_pearson_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
idx <- which(abs(cor_pearson_mat) == cor_pearson_vec[i], arr.ind = T)
idx1 <- idx[1]; idx2 <- idx[2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_pearson_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(2,2))
for(i in 1:4){
idx <- which(abs(cor_pearson_mat) == rev(cor_pearson_vec)[i], arr.ind = T)
idx1 <- idx[1]; idx2 <- idx[2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_pearson_mat[idx1, idx2], 3)))
}

Heatmap
heatmap_list[[1]][[1]]

2. Spearman’s correlation coefficient
- It captures the monotonic relationship between data, X and Y
- \(-1 \leq \rho_{Spearman}(X,Y) \leq 1\)
- \(\rho_{Spearman} = 1 - \frac{6\sum{d_i^2}}{n(n^2-1)}\) where \(d_i\) is the difference between the ranks of \(x_i\) and \(y_i\)
cor_spearman_mat[1:5,1:5]
## FAM13A SLC2A1 SMIM1 SASS6 FH
## FAM13A NA NA NA NA NA
## SLC2A1 0.1364108 NA NA NA NA
## SMIM1 0.2389259 0.21482350 NA NA NA
## SASS6 0.1242811 0.18918435 0.3242491 NA NA
## FH 0.1214335 0.07247562 0.2364899 0.2033483 NA
quantile(cor_spearman_mat, na.rm = T)
## 0% 25% 50% 75% 100%
## -0.440495146 -0.003897269 0.065754765 0.143688187 0.712296031
quantile(abs(cor_spearman_mat), na.rm = T)
## 0% 25% 50% 75% 100%
## 9.615496e-08 1.368600e-02 6.719353e-02 1.438421e-01 7.122960e-01
# plot the smallest correlations
cor_spearman_vec <- sort(abs(cor_spearman_mat), decreasing = T)
plot(cor_spearman_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
idx <- which(abs(cor_spearman_mat) == cor_spearman_vec[i], arr.ind = T)
idx1 <- idx[1]; idx2 <- idx[2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_spearman_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(2,2))
for(i in 1:4){
idx <- which(abs(cor_spearman_mat) == rev(cor_spearman_vec)[i], arr.ind = T)
idx1 <- idx[1]; idx2 <- idx[2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_spearman_mat[idx1, idx2], 3)))
}

Heatmap
heatmap_list[[1]][[2]]

3. Kendall’s Tau
- It is an alternative method to Spearman’s correlations, identifying monotonic relationships.
- \(-1 \leq \rho_{Kendall}(X,Y) \leq 1\)
- \(\rho_{Kendall}(X,Y) = \frac{\#\;concordant\;pairs - \#\;discordant \;pairs}{0.5n(n-1)}\)
cor_kendall_mat[1:5,1:5]
## FAM13A SLC2A1 SMIM1 SASS6 FH
## FAM13A NA NA NA NA NA
## SLC2A1 0.1343928 NA NA NA NA
## SMIM1 0.2342984 0.21127365 NA NA NA
## SASS6 0.1223270 0.18686430 0.3193021 NA NA
## FH 0.1191061 0.07104971 0.2316636 0.1998652 NA
quantile(cor_kendall_mat, na.rm = T)
## 0% 25% 50% 75% 100%
## -0.32474937 -0.00384040 0.06228777 0.13779609 0.70687460
quantile(abs(cor_kendall_mat), na.rm = T)
## 0% 25% 50% 75% 100%
## 0.00000000 0.01325622 0.06343508 0.13789744 0.70687460
# plot the smallest correlations
cor_kendall_vec <- sort(abs(cor_kendall_mat), decreasing = T)
plot(cor_kendall_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
idx <- which(abs(cor_kendall_mat) == cor_kendall_vec[i], arr.ind = T)
idx1 <- idx[1]; idx2 <- idx[2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_kendall_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(2,2))
for(i in 1:4){
idx <- which(abs(cor_kendall_mat) == rev(cor_kendall_vec)[i], arr.ind = T)
idx1 <- idx[1]; idx2 <- idx[2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_kendall_mat[idx1, idx2], 3)))
}

Heatmap
heatmap_list[[1]][[3]]

4. Hoeffding’s D
- It tests the independence of data by calculating the distance between the product of the marginal distributions under the null hypothesis and the empirical bi-variate distribution.
- \(-1 \leq D(X,Y) \leq 1\)
- \(D(X,Y) = \frac{(n-2)(n-3)D_1+D_2-2(n-2)D_3}{n(n-1)(n-2)(n-3)(n-4)}\)
- \(D_1 = \sum_{i=1}^{n} Q_i(Q_i-1)\)
- \(D_2 = \sum_{i=1}^{n} (R_i-1)(R_i-2)(S_j-1)(S_j-2)\)
- \(D_3 = \sum_{i=1}^{n} (R_i-2)(S_i-2)Q_i\)
cor_hoeffd_mat[1:5,1:5]
## FAM13A SLC2A1 SMIM1 SASS6 FH
## FAM13A NA NA NA NA NA
## SLC2A1 -0.001116789 NA NA NA NA
## SMIM1 -0.000982215 -0.001038485 NA NA NA
## SASS6 -0.001129462 -0.001107714 -0.0009209014 NA NA
## FH -0.001086316 -0.001125619 -0.0009165829 -0.001040697 NA
quantile(cor_hoeffd_mat, na.rm = T)
## 0% 25% 50% 75% 100%
## -0.0012343631 -0.0011867196 -0.0011017526 -0.0009353402 0.1456133536
# plot the smallest correlations
cor_hoeffd_vec <- sort(abs(cor_hoeffd_mat), decreasing = T)
plot(cor_hoeffd_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
idx <- which(abs(cor_hoeffd_mat) == cor_hoeffd_vec[i], arr.ind = T)
idx1 <- idx[1]; idx2 <- idx[2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_hoeffd_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(2,2))
for(i in 1:4){
idx <- which(abs(cor_hoeffd_mat) == rev(cor_hoeffd_vec)[i], arr.ind = T)
idx1 <- idx[1]; idx2 <- idx[2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_hoeffd_mat[idx1, idx2], 3)))
}

Heatmap
heatmap_list[[1]][[4]]

5. Blomqvist’s Beta
- It measures dependency between variables by constructing a two-way contingency table with the medians of each margin as cutting points.
- \(0 \leq \beta \leq 1\)
- \(\beta_n = \frac{n_1-n_2}{n_1+n_2} = \frac{2n_1}{n_1+n_2} - 1\)
- \(\beta = P\{(X-\tilde{x})(Y-\tilde{y})>0\} - P\{(X-\tilde{x})(Y-\tilde{y}) < 0\}\)
cor_blomqvist_mat[1:5,1:5]
## [,1] [,2] [,3] [,4] [,5]
## [1,] NA NA NA NA NA
## [2,] 0.8845144 NA NA NA NA
## [3,] 0.8832021 0.8858268 NA NA NA
## [4,] 0.8884514 0.9041995 0.9055118 NA NA
## [5,] 0.8543307 0.8543307 0.8661417 0.8792651 NA
quantile(cor_blomqvist_mat, na.rm = T)
## 0% 25% 50% 75% 100%
## -0.2034121 0.6706037 0.8333333 0.9343832 1.0000000
quantile(abs(cor_blomqvist_mat), na.rm = T)
## 0% 25% 50% 75% 100%
## 0.0000000 0.6706037 0.8333333 0.9343832 1.0000000
# plot the smallest correlations
cor_blomqvist_vec <- sort(abs(cor_blomqvist_mat), decreasing = T)
plot(cor_blomqvist_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
idx <- which(abs(cor_blomqvist_mat) == cor_blomqvist_vec[i], arr.ind = T)
idx1 <- idx[i,1]; idx2 <- idx[i,2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_blomqvist_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(1,1))
for(i in 1:1){
idx <- which(abs(cor_blomqvist_mat) == rev(cor_blomqvist_vec)[i], arr.ind = T)
idx1 <- idx[i,1]; idx2 <- idx[i,2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_blomqvist_mat[idx1, idx2], 3)))
}

Heatmap
heatmap_list[[1]][[5]]

6. Distance Correlation
- it is measure to identify non-linear relationships between two random variables with energy distances.
- distance correlation is calculated by dividing the distance covariance between X and Y by the product of their distance standard deviations.
- \(0 \leq dCor \leq 1\)
- \(dCor(X,Y) = \frac{dCov(Y,Y)}{\sqrt{dVar(X)dVar(Y)}}\)
- \(dCov(X, Y) = \sqrt{\frac{1}{n^2} \sum_{k=1, l=1}^{n} A_{k,l}B_{k,l}}\)
- \(dVar(X) = dCov(X,X) and dVar(Y) = dCov(Y, Y)\)
cor_dist_mat[1:5,1:5]
## [,1] [,2] [,3] [,4] [,5]
## [1,] NA NA NA NA NA
## [2,] 0.14308226 NA NA NA NA
## [3,] 0.17112701 0.1804801 NA NA NA
## [4,] 0.07112053 0.1677991 0.3067075 NA NA
## [5,] 0.12274024 0.1111194 0.2082887 0.2617941 NA
quantile(cor_dist_mat, na.rm = T)
## 0% 25% 50% 75% 100%
## 0.00000000 0.04749414 0.15603308 0.31203912 1.00000000
# plot the smallest correlations
cor_dist_vec <- sort(abs(cor_dist_mat), decreasing = T)
plot(cor_dist_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
idx <- which(abs(cor_dist_mat) == cor_dist_vec[i], arr.ind = T)
idx1 <- idx[1]; idx2 <- idx[2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_dist_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(2,2))
for(i in 1:4){
idx <- which(abs(cor_dist_mat) == rev(cor_dist_vec)[i], arr.ind = T)
idx1 <- idx[1]; idx2 <- idx[2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_dist_mat[idx1, idx2], 3)))
}

Heatmap
heatmap_list[[1]][[6]]

7. Normalized Mutual Information
- It measures how much one random variable gives information about the other. For example, High mutual information indicates a large reduction in uncertainty.
- \(0 \leq MI(X,Y) \leq 1\), as it is normalized.
- $MI(X,Y) = f_{X,Y} (x,y) log_2 ; dxdy $
- \(MI(X,Y) = \sum \sum p_{X,Y} (x,y) log \frac{p_{X,Y} (x,y)}{P_X(x)P_Y(y)}\)
cor_MI_mat[1:5,1:5]
## [,1] [,2] [,3] [,4] [,5]
## [1,] NA NA NA NA NA
## [2,] 0.006835049 NA NA NA NA
## [3,] 0.020636128 0.025660357 NA NA NA
## [4,] 0.002255856 0.027580545 0.05271525 NA NA
## [5,] 0.004373342 0.001698769 0.02459874 0.01251397 NA
quantile(cor_MI_mat, na.rm = T)
## 0% 25% 50% 75% 100%
## 0.0000000000 0.0001934935 0.0024023532 0.0096106548 0.6511522564
# plot the smallest correlations
cor_MI_vec <- sort(abs(cor_MI_mat), decreasing = T)
plot(cor_MI_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
idx <- which(abs(cor_MI_mat) == cor_MI_vec[i], arr.ind = T)
idx1 <- idx[1]; idx2 <- idx[2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_MI_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(2,2))
for(i in 1:4){
idx <- which(abs(cor_MI_mat) == rev(cor_MI_vec)[i], arr.ind = T)
idx1 <- idx[1]; idx2 <- idx[2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_MI_mat[idx1, idx2], 3)))
}

Heatmap
heatmap_list[[1]][[7]]

8. Chatterjee’s XI Correlation
- It measures the degree of dependence between the variables with concept of rank.
- \(0 \leq \xi_n \leq 1\)
- \(\xi_n(X,Y) = 1 - \frac{3\sum_{i=1}^{n-1} |r_{i+1} -r_i|}{n^2-1}\)
- \(\xi_n(X,Y) = 1 - \frac{n\sum_{i=1}^{n-1}|r_{i+1}-r_i}{2\sum_{i=1}^{n}l_i(n-l_i)}\)
cor_XI_mat[1:5,1:5]
## [,1] [,2] [,3] [,4] [,5]
## [1,] NA NA NA NA NA
## [2,] 0.05738036 NA NA NA NA
## [3,] 0.05521768 0.05337987 NA NA NA
## [4,] 0.03297297 0.05379707 0.1172422 NA NA
## [5,] 0.03420879 0.02876524 0.0361869 0.02030034 NA
quantile(cor_XI_mat, na.rm = T)
## 0% 25% 50% 75% 100%
## -0.078105939 -0.005453346 -0.001314060 0.016424709 0.601551998
# plot the smallest correlations
cor_XI_vec <- sort(abs(cor_XI_mat), decreasing = T)
plot(cor_XI_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
idx <- which(abs(cor_XI_mat) == cor_XI_vec[i], arr.ind = T)
idx1 <- idx[1]; idx2 <- idx[2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_XI_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(2,2))
for(i in 1:4){
idx <- which(abs(cor_XI_mat) == rev(cor_XI_vec)[i], arr.ind = T)
idx1 <- idx[1]; idx2 <- idx[2]
plot(sub_dat[,idx1], sub_dat[,idx2], col = sub_cluster_labels, asp = T,
pch = 16, xlab = paste0(colnames(sub_dat)[idx1], ", (", idx1, ")"),
ylab = paste0(colnames(sub_dat)[idx2], ", (", idx2, ")"),
main = paste0("Correlation of ", round(cor_XI_mat[idx1, idx2], 3)))
}

Heatmap
heatmap_list[[1]][[8]]

quantile_mat <- c()
for (i in 1:length(cormat_list)){
quantile_mat <- rbind(quantile_mat,
quantile(abs(cormat_list[[i]]), probs = c(0.25, 0.85), na.rm = T))
}
rownames(quantile_mat) <- c("Pearson", "Spearman", "Kendall", "Hoeffding's D",
"Blomqvist's Beta","Dist. Corr", "NMI", "XI Corr")
quantile_mat
## 25% 85%
## Pearson 0.0090493773 0.085820744
## Spearman 0.0136860046 0.183951529
## Kendall 0.0132562171 0.178417554
## Hoeffding's D 0.0009391512 0.001208102
## Blomqvist's Beta 0.6706036745 0.960629921
## Dist. Corr 0.0474941401 0.418182424
## NMI 0.0001934935 0.015738374
## XI Corr 0.0030470712 0.032770827
save(quantile_mat, file = "seurat_quantile.RData")